[Python] 自動抓取最新搜索結果

這是前一陣子做的小工具:從 input.txt 逐行讀入 N 筆要搜尋的關鍵字後,會為每個關鍵字輸出一個文字檔,內容是一個月內的 Yahoo 搜尋結果(標題、連結、日期)。最近要用到 Python,就先挖出來放在這裡。

利用urllib抓取網頁,利用BeautifulSoup解析資料。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import urllib.error
# Fixed parts of the Yahoo Taiwan search URL; the URL-quoted query goes between them.
# NOTE(review): the "fr2=time" / "btf=m" parameters presumably restrict results to
# the past month (per the accompanying description) -- confirm against Yahoo.
SEARCH_URL_PREFIX = "https://tw.search.yahoo.com/search;_ylt=A8tUwYpDIQRU33UApv9r1gt.?fr2=time&ei=utf-8&fr=yfp&p="
SEARCH_URL_SUFFIX = "&btf=m"


def build_search_url(query):
    """Return the search URL for *query*, with the query percent-encoded."""
    return SEARCH_URL_PREFIX + urllib.parse.quote(query) + SEARCH_URL_SUFFIX


def parse_results(html):
    """Extract result titles, links and dates from a search result page.

    Args:
        html: The raw page content as returned by ``urlopen(...).read()``.

    Returns:
        A ``(titles, links, dates)`` tuple of lists of strings. ``titles`` and
        ``links`` always have the same length; ``dates`` may be shorter or
        longer because it is collected independently of the anchors.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(html, 'html.parser')
    titles = []
    links = []
    for result_list in soup.find_all('ol'):
        for anchor in result_list.find_all('a', 'yschttl spt'):
            # Anchors carrying either of these attributes are skipped.
            if anchor.get('data-bns') or anchor.get('thmbplay'):
                continue
            titles.append(anchor.get_text())
            # A missing href becomes '' so that writing never hits None.
            links.append(anchor.get('href') or '')
    # Collected once for the whole page; the search is document-wide, so
    # repeating it per <ol> would only duplicate every date.
    dates = [span.get_text() for span in soup.find_all('span', id='resultTime')]
    return titles, links, dates


def format_results(titles, links, dates):
    """Return the text written to the output file for one query.

    Each result becomes four lines: title, link, date, and a separator.
    A result without a matching date gets an empty date line instead of
    raising ``IndexError``.
    """
    records = []
    for index, (title, link) in enumerate(zip(titles, links)):
        date = dates[index] if index < len(dates) else ''
        records.append(title + '\n' + link + '\n' + date + '\n' + '----------------\n')
    return ''.join(records)


def main():
    """Read queries from ``input.txt`` and write one ``<query>.txt`` per query.

    Each non-blank line of ``input.txt`` is used as a search query. Network
    and file errors are not caught here and propagate to the caller.
    """
    with open('input.txt', 'r', encoding='utf-8') as finput:
        for line in finput:
            text = line.strip('\n')
            # Skip blank lines: they would issue an empty query and create
            # a file literally named ".txt".
            if not text.strip():
                continue
            with urllib.request.urlopen(build_search_url(text)) as response:
                html = response.read()
            titles, links, dates = parse_results(html)
            with open(text + '.txt', 'w', encoding='UTF-8') as foutput:
                foutput.write(format_results(titles, links, dates))


if __name__ == "__main__":
    main()